# Gathering nameplate capacity, county, and nerc regions for OGE plants 
library(pacman)
p_load(
  here, fst, data.table, janitor, readxl, tidyr, magrittr,
  tigris, purrr, sf, skimr, stringr, collapse, dplyr
)
options(tigris_use_cache = TRUE)

clean_plant_info = function(){
# Loading data ----------------------------------------------------------------
  print('loading data')
  # Loading oge plant info table 
  oge_plant_info_dt_raw = 
    rbind(
      fread(here(
        'Data/electricity-generation/open-grid-emissions/v0.3.0/plant_static_attributes_2019.csv'
      ))[,year:=2019],
      fread(here(
        'Data/electricity-generation/open-grid-emissions/v0.3.0/plant_static_attributes_2020.csv'
      ))[,year:=2020]
    ) |> setkey(plant_id_eia)
  # eGRID PLant data for unit -> balancing authority, and lat/lon
  egrid_plant_dt = 
    rbind(
      read_xlsx(
        here("Data/electricity-generation/egrid/egrid2019_data.xlsx"),
        sheet = "PLNT19",
        skip = 1
      ) |> 
        clean_names() |> 
        data.table() %>% 
        .[,.(
          plant_id_camd = orispl,
          nerc_egrid = nerc,
          egrid_subregion = subrgn,
          year = 2019,
          lat_egrid = lat, lon_egrid = lon
        )],
      read_xlsx(
        here("Data/electricity-generation/egrid/egrid2020_data.xlsx"),
        sheet = "PLNT20",
        skip = 1
      ) |> 
        clean_names() |> 
        data.table() %>% 
        .[,.(
          plant_id_camd = orispl,
          nerc_egrid = nerc,
          egrid_subregion = subrgn,
          year = 2020,
          lat_egrid = lat, lon_egrid = lon
        )]
    )
  # eGRID Generator ID's (has namepcap)
  egrid_gen_dt = 
    rbind(
      read_xlsx(
        here("Data/electricity-generation/egrid/egrid2019_data.xlsx"),
        sheet = "GEN19",
        skip = 1
      ) |> 
        clean_names() |> 
        data.table() %>% 
        .[,.(
          plant_id_camd = orispl,
          genid,
          year,
          namepcap_egrid = namepcap
        )],
      read_xlsx(
        here("Data/electricity-generation/egrid/egrid2020_data.xlsx"),
        sheet = "GEN20",
        skip = 1
      ) |> 
        clean_names() |> 
        data.table() %>% 
        .[,.(
          plant_id_camd = orispl,
          genid,
          year,
          namepcap_egrid = namepcap
        )]
    )
  # EIA 860 form 
  eia860_plant_dt = 
    rbind(
      read_xlsx(
        here("Data/electricity-generation/eia860/eia8602019/2___Plant_Y2019.xlsx"),
        skip = 1
      ) |>
        data.table() %>% 
        .[,.(
          year = 2019, 
          plant_id_eia = `Plant Code`,
          nerc_eia = `NERC Region`,
          lat_eia = ifelse(`Plant Code` == "62262", 42.89903, Latitude),
          lon_eia = Longitude
        )],
      read_xlsx(
        here("Data/electricity-generation/eia860/eia8602020/2___Plant_Y2020.xlsx"),
        skip = 1
      ) |>
        data.table() %>% 
        .[,.(
          year = 2020, 
          plant_id_eia = `Plant Code`,
          nerc_eia = `NERC Region`,
          lat_eia = ifelse(`Plant Code` == "62262", 42.89903, Latitude),
          lon_eia = Longitude
        )]
    )
  # EIA generator level data 
  eia860_gen_dt = 
    rbind(
      read_xlsx(
        here("Data/electricity-generation/eia860/eia8602019/3_1_Generator_Y2019.xlsx"),
        skip = 1
      ) |>
        data.table() %>% 
        .[,.(
          year = 2019, 
          plant_id_eia = `Plant Code`,
          eia_generator_id = `Generator ID`,
          namepcap_eia = `Nameplate Capacity (MW)`
        )],
      read_xlsx(
        here("Data/electricity-generation/eia860/eia8602020/3_1_Generator_Y2020.xlsx"),
        skip = 1
      ) |>
        data.table() %>% 
        .[,.(
          year = 2020, 
          plant_id_eia = `Plant Code`,
          eia_generator_id = `Generator ID`,
          namepcap_eia = `Nameplate Capacity (MW)`
        )]
    )
  # Stack heights 
  stack_height_dt = 
    fread(
      here("Data/electricity-generation/plant-info/StackHeightInfo(20191202).csv")
    )[,.(
      plant_id_camd = `ORIS CODE`,
      unitid = `LOCATION IDENTIFIER`,
      stack_height = `STACK HEIGHT`
    )] %>%  
    unique() %>% 
    separate_rows(unitid, sep = ',') %>% 
    data.table() %>% 
    .[,.(
      stack_height = median(stack_height, na.rm =TRUE)),
      keyby = .(plant_id_camd)
    ]

# Aggregating unit to plant ---------------------------------------------------
  print('merging data')
  egrid_plant_dt = 
    merge(
      egrid_plant_dt[,-'namepcap_egrid'],
      egrid_gen_dt[,
        .(namepcap_egrid = sum(namepcap_egrid)), 
        keyby = .(plant_id_camd,year)
      ],
      by = c('plant_id_camd', 'year'),
      all = TRUE
    )
  eia860_plant_dt = 
    merge(
      eia860_plant_dt[,-'namepcap_eia'],
      eia860_gen_dt[,
        .(namepcap_eia = sum(namepcap_eia)),
        keyby = .(plant_id_eia, year)
      ],
      by = c('plant_id_eia', 'year'),
      all = TRUE
    )[!is.na(plant_id_eia)]

# Merging eia, egrid, oge data ------------------------------------------------
  oge_plant_info_dt = 
    merge(
      oge_plant_info_dt_raw,
      eia860_plant_dt,
      by = c('plant_id_eia','year'),
      all.x = TRUE
    ) |>
    merge(
      egrid_plant_dt,
      by.x = c('plant_id_eia','year'),
      by.y = c('plant_id_camd','year'),
      all.x = TRUE
    ) |> 
    merge(
      stack_height_dt,
      by.x = c("plant_id_eia"),
      by.y = c("plant_id_camd"),
      all.x = TRUE
    ) %>% 
    .[,.(
      plant_id_eia, year, 
      ba_code, 
      ba_code_physical, 
      egrid_subregion,
      state, 
      fuel_category, 
      plant_primary_fuel, 
      timezone, 
      lat = fcase(
        !is.na(lat_eia), lat_eia, 
        !is.na(lat_egrid), lat_egrid
      ), 
      lon = fcase(
        !is.na(lon_eia), lon_eia, 
        !is.na(lon_egrid), lon_egrid
      ),
      shaped_plant_id,
      nerc = fcase(
        !is.na(nerc_eia), nerc_eia, 
        !is.na(nerc_egrid), nerc_egrid
      ),
      namepcap = fcase(
        !is.na(namepcap_eia), namepcap_eia,
        !is.na(namepcap_egrid), namepcap_egrid
      ),
      namepcap_source = fcase(
        !is.na(namepcap_eia), 'eia',
        !is.na(namepcap_egrid), 'egrid'
      ),
      stack_height
    )]
  # 6 plants are missing lat/lons and namepcap
  ba_avg_loc = 
    oge_plant_info_dt |>
    get_vars(vars = c('ba_code','fuel_category','lat','lon','namepcap')) |>
    gby(ba_code, fuel_category) |>
    fmean()
  # Taking lat/lon from other year (4 plants) or BA-feul avg (2 plants)
  plant_missing_latlon = 
    merge(
      oge_plant_info_dt[is.na(lat)],
      eia860_plant_dt[,
        .SD[which.min(year)],
        by = plant_id_eia
      ][,-'year'], 
      by = 'plant_id_eia',
      all.x = TRUE
    ) |>
    merge(
      egrid_plant_dt[,
        .SD[which.min(year)],
        by = plant_id_camd
      ][,-'year'], 
      by.x = 'plant_id_eia', 
      by.y = 'plant_id_camd',
      all.x = TRUE
    ) |>
    merge(
      ba_avg_loc, 
      by = c('ba_code','fuel_category'),
      all.x = TRUE
    ) %>%
    .[,.(
      plant_id_eia, 
      lat_fill = coalesce(lat_eia, lat_egrid, lat.y), 
      lon_fill = coalesce(lon_eia, lon_egrid, lon.y), 
      namepcap_fill = coalesce(namepcap_eia, namepcap_egrid, namepcap.y), 
      nerc_fill = coalesce(nerc_eia, nerc_egrid)
    )]
  # Adding to main data 
  oge_plant_info_dt = 
    merge(
      oge_plant_info_dt, 
      plant_missing_latlon, 
      by = 'plant_id_eia',
      all.x = TRUE
    )[,':='(
      lat = coalesce(lat, lat_fill),
      lon = coalesce(lon, lon_fill),
      namepcap = coalesce(namepcap, namepcap_fill),
      nerc = coalesce(nerc, nerc_fill),
      lat_fill = NULL, 
      lon_fill = NULL, 
      namepcap_fill = NULL, 
      nerc_fill = NULL
    )]
  skim(oge_plant_info_dt) |> print()

# Assigning counties ----------------------------------------------------------
  print('matching plant locations to counties')
  # loading county shapes
  county_sf = map_dfr(
    unique(oge_plant_info_dt[state != '<NA>']$state), 
    counties, 
    cb = TRUE,
    year = 2019
  )
  # Merging plant location with counties 
  plant_county_dt = 
    st_as_sf(
      oge_plant_info_dt[!is.na(lat)],
      coords = c('lon','lat'),
      crs = 4326
    ) %>%  
    st_transform(crs = st_crs(county_sf)) %>% 
    st_join(county_sf, left = TRUE) %>% 
    data.table() %>% 
    .[,.(
      plant_id_eia, year, 
      fips = paste0(STATEFP, COUNTYFP)
    )]
  # Adding to the data 
  oge_plant_info_dt = 
    merge(
      oge_plant_info_dt[,-'fips'],
      plant_county_dt,
      by = c('plant_id_eia', 'year'),
      all.x = TRUE
    )
  # That didn't work for a few plants (close to borders?)
  missing_plants = st_as_sf(
    oge_plant_info_dt[fips == 'NANA'],
    coords = c('lon','lat'),
    crs = 4326
  ) %>% st_transform(crs = st_crs(county_sf))
  # Calculating distance
  fixed_plants = 
    map_dfr(
      seq_len(nrow(missing_plants)),
      \(i){
        county_sf[which.min(
          st_distance(county_sf, missing_plants[i,])
        ),] %>% 
        data.table() %>% 
        .[,.(
          plant_id_eia = missing_plants[i,]$plant_id_eia,
          year = missing_plants[i,]$year,
          fips_dist = paste0(STATEFP, COUNTYFP)
        )]
      }
  ) 
  # Adding into data 
  oge_plant_info_dt = 
    merge(
      oge_plant_info_dt, 
      fixed_plants, 
      by = c('plant_id_eia', 'year'),
      all.x = TRUE
    )[,':='(
      fips = ifelse(fips == 'NANA', fips_dist, fips),
      fips_dist = NULL
    )]
  # Making sure there aren't any plants with missing counties
  if(nrow(oge_plant_info_dt[is.na(fips) | str_detect(fips,'NA')]) >0){
    stop('Some plants are not assigned to a county')
  }
# Assigning BA Codes to nerc regions ------------------------------------------
  print('assigning nerc_adj')
  ba_nerc_dt = oge_plant_info_dt[,.N,keyby = .(ba_code, nerc)]
  # Crosswalk between BA Codes and nerc adj 
  ba_nerc_xwalk = 
    ba_nerc_dt[
      ba_nerc_dt[,.(max_index = .I[N == max(N)]), by = ba_code]$max_index, .(
        ba_code, 
        nerc_adj = fcase(
          ba_code == "MISO", "MRO",
          ba_code == "PJM", "RFC",
          ba_code %in% c("BANC","CISO","IID","LDWP","TIDC"), "CAL",
          !is.na(nerc), nerc
        )
      )]
  # Saving results for later use
  fwrite(
    ba_nerc_xwalk,
    file = here('Data/electricity-generation/ba-nerc-xwalk-oge.csv')
  )
  # Adding to the plant data 
  oge_plant_info_dt = 
    merge(
      oge_plant_info_dt[,-'nerc_adj'],
      ba_nerc_xwalk,
      by = 'ba_code'
    ) |> setkey(plant_id_eia, year)
  # Makng sure there aren't any missing
  if(nrow(oge_plant_info_dt[is.na(nerc_adj)]) >0){
    stop('Some plants are not assigned to a nerc_adj region')
  }
# Filling in missing stack heights --------------------------------------------
  print('filling in stack heights')
  # Setting to the median for each fuel type
  oge_plant_info_dt = 
    merge(
      oge_plant_info_dt,
      oge_plant_info_dt[,.(
        med_stack_height = median(stack_height, na.rm = TRUE)),
        keyby = .(fuel_category, year)
      ],
      by = c('year','fuel_category'),
      all.x = TRUE
    )
  oge_plant_info_dt[,
    stack_height := fcase(
    !is.na(stack_height), stack_height,
    is.na(stack_height) & !is.na(med_stack_height), med_stack_height,
    default = 0
  )]
  oge_plant_info_dt[,med_stack_height := NULL]
  # Checking missing 
  if(nrow(oge_plant_info_dt[is.na(stack_height)]) >0){
    stop('Some plants are not assigned a stack height')
  }
# Saving the results ----------------------------------------------------------
  write.fst(
    oge_plant_info_dt,
    path = here("Data/electricity-generation/plant-info-dt-oge.fst")
  )
  print('done.')
}
  